#' Normalise high-school names so they can be matched across exam years
#'
#' Builds a lookup of the distinct (`judet`, `unitate_de_invatamant`) pairs in
#' `hs`, derives a cleaned school name for each pair (diacritics stripped,
#' quotes unified, upper-cased, whitespace normalised, town names and
#' abbreviations expanded, year-specific patches applied) and merges the
#' cleaned name back onto `data_bac_raw`.
#'
#' @param hs Data frame with at least the columns `judet` and
#'   `unitate_de_invatamant`; the lookup of names to clean is taken from it.
#' @param year Exam year; only 2014 and 2019 trigger extra patches.
#' @param data_bac_raw Data frame the cleaned names are merged onto. Defaults
#'   to `hs` itself. (The function used to read a variable of this name from
#'   the calling environment; pass it explicitly if it differs from `hs`.)
#' @return `data_bac_raw` merged on `judet` + `unitate_de_invatamant`
#'   (rows are re-ordered by `merge()`), where `unitate_de_invatamant` holds
#'   the cleaned name and `unitate_de_invatamant_orig` the original one.
clean__bac__clean_hs <- function(hs, year, data_bac_raw = hs) {
  # Evaluate the default now: `hs` is about to be replaced by the lookup table.
  force(data_bac_raw)

  key_cols <- c("judet", "unitate_de_invatamant")
  stopifnot(is.data.frame(hs), all(key_cols %in% names(hs)))

  # Pairs of (pattern, replacement), one pair per row, kept in call order.
  rule_table <- function(...) {
    matrix(c(...), ncol = 2L, byrow = TRUE)
  }
  # Apply every rule in order with gsub() (default regex engine).
  apply_rules <- function(x, rules) {
    for (i in seq_len(nrow(rules))) {
      x <- gsub(rules[i, 1L], rules[i, 2L], x)
    }
    x
  }

  lookup <- unique(hs[, key_cols])
  nm <- as.character(lookup$unitate_de_invatamant)

  # ---- Diacritics and typographic quotes (applied before upper-casing) ----
  # The order is significant and duplicates are kept on purpose: later rules
  # operate on the output of earlier ones.
  char_rules <- rule_table(
    # 2017
    "á", "A",
    "ș", "S",
    "ț", "T",
    "é", "E",
    "„", "\"",
    "ó", "O",
    "ö", "O",
    "ö", "O",
    "ő", "O",
    "’’", "\"",
    "’", "\"",
    ",,", "\"",
    # 2016
    "“", "\"",
    "”", "\"",
    "Á", "A",
    "ă", "A",
    "Ă", "A",
    "â", "A",
    "Â", "A",
    "É", "E",
    "Î", "I",
    # Lower-case i-circumflex used to slip through and was then upper-cased
    # to a non-ASCII "Î" by toupper() below.
    "î", "I",
    "Ó", "O",
    "Ö", "O",
    "Ő", "O",
    "Ș", "S",
    "ş", "S",
    "Ş", "S",
    "Ț", "T",
    "ţ", "T",
    "Ţ", "T",
    # 2014 (the unescaped dots match any character)
    "I. C. PETRESCU: STALPENI", "I. C. PETRESCU\", STALPENI",
    # 2013
    "‘ ", "\""
  )
  nm <- apply_rules(nm, char_rules)

  # ---- Case and spacing ----
  nm <- toupper(nm)
  # Insert a space after a dot that is directly followed by a letter.
  nm <- gsub("\\.([A-Za-z])", "\\. \\1", nm)
  # Remove whitespace on either side of any dash character.
  nm <- gsub("\\s+(?=\\p{Pd})|(?<=\\p{Pd})\\s+", "", nm, perl = TRUE)
  # Trim and collapse runs of whitespace.
  nm <- gsub("\\s+", " ", trimws(nm))

  # ---- Remaining ASCII quote variants become a plain double quote ----
  quote_rules <- rule_table(
    "''-", "\" ",
    "\"-", "\" ",
    "''", "\"",
    "'", "\""
  )
  nm <- apply_rules(nm, quote_rules)

  # ---- Town names: old i-circumflex spellings, abbreviations, hyphenation ----
  town_rules <- rule_table(
    "JIU\\.", "JIU,",
    "RM\\.", "RAMNICU",
    "TG\\.", "TARGU",
    "TG", "TARGU",
    "TIRG", "TARG",
    "SFINT", "SFANT",
    "SINGEORGIU", "SANGEORGIU",
    "SINMARTIN", "SANMARTIN",
    "SINTANA", "SANTANA",
    "RIMNICU", "RAMNICU",
    "PINCOTA", "PANCOTA",
    "CIMPINA", "CAMPINA",
    "RISNOV", "RASNOV",
    "CIMPIA", "CAMPIA",
    "HIRSOVA", "HARSOVA",
    "CIMPULUNG", "CAMPULUNG",
    "PIATRA NEAMT", "PIATRA-NEAMT",
    "TARGU JIU", "TARGU-JIU",
    "FIERBINTI-TARG", "FIERBINTI",
    "HIRLAU", "HARLAU",
    "TIRNAVENI", "TARNAVENI",
    "CURTEA DE AG\\.", "CURTEA DE ARGES",
    "CURTEA DE AG", "CURTEA DE ARGES"
  )
  nm <- apply_rules(nm, town_rules)

  # ---- Expansions that make names comparable across years ----
  name_rules <- rule_table(
    "GR\\. SC\\.", "GRUPUL SCOLAR",
    "GRUP SCOLAR", "GRUPUL SCOLAR",
    "LIC\\.", "LICEUL",
    "TEHN\\.", "TEHNOLOGIC",
    "GRUP SC\\.", "GRUPUL SCOLAR",
    "AL\\. I\\.", "ALEXANDRU IOAN",
    "A\\. I\\.", "ALEXANDRU IOAN",
    "GH\\. M\\.", "GHEORGHE MUNTEANU",
    # NOTE(review): this also swallows the space in front of "GH." and so
    # glues the name to the preceding word -- confirm whether intended.
    " GH\\.", "GHEORGHE",
    "BARTOK BELA", "BELA BARTOK",
    "C\\. BREDICEANU", "CORIOLAN BREDICEANU",
    "C\\. DIACONOVICI", "CONSTANTIN DIACONOVICI",
    "C\\. NEGRI", "COSTACHE NEGRI",
    "D\\. CANTEMIR", "DIMITRIE CANTEMIR",
    "G\\. VRANCEANU", "GHEORGHE VRANCEANU",
    "GR\\.", "GRIGORE",
    "M\\. EMINESCU", "MIHAI EMINESCU",
    "V\\. ALECSANDRI", "VASILE ALECSANDRI",
    "G\\. APOSTU", "GEORGE APOSTU",
    "A\\. SALIGNY", "ANGHEL SALIGNY",
    "C\\. D\\. NENITESCU", "COSTIN D\\. NENITESCU",
    "ED\\. NICOLAU", "EDMOND NICOLAU",
    "G-RAL", "GENERAL",
    "C-TIN", "CONSTANTIN",
    "D\\. PRAPORGESCU", "DAVID PRAPORGESCU",
    "AL\\.", "ALEXANDRU",
    "G\\. P\\.", "GEORGE POP",
    "I\\. ZOSSIMA", "IORDACHE ZOSSIMA",
    "TG\\.", "TARGU",
    "TG", "TARGU",
    "J\\. LEBEL", "JOHANNES LEBEL",
    "J\\. M\\. ELIAS", "JACQUES M\\. ELIAS",
    "C-TIN", "CONSTANTIN",
    "M\\. BASARAB", "MATEI BASARAB",
    "N\\. ONCESCU", "NICOLAE ONCESCU",
    "SF\\.", "SFANTUL",
    "T\\. VLADIMIRESCU", "TUDOR VLADIMIRESCU",
    "RM\\.", "RAMNICU",
    "I\\. L\\.", "ION LUCA",
    "I\\. C\\. DRAGUSANU", "ION CODRU DRAGUSANU",
    "J GREGOR TAJOVSKI", "JOZEF GREGOR TAJOVSKI",
    "S\\. HARET", "SPIRU HARET",
    "SF ", "SFANTU ",
    " TIMIS$", "",
    "C\\. BRANCUSI", "CONSTANTIN BRANCUSI",
    "A\\. IANCU", "AVRAM IANCU",
    "J\\. KOZACEK", "JOZEF KOZACEK",
    "I\\. VULCAN", "IOSIF VULCAN",
    # NOTE(review): an input already spelled "M. VITEAZUL" ends up as
    # "MIHAI VITEAZULL" -- confirm against the data.
    "M\\. VITEAZU", "MIHAI VITEAZUL",
    "S\\. VULCAN", "SAMUIL VULCAN",
    "A\\. SAGUNA", "ANDREI SAGUNA",
    "T\\. VUIA", "TRAIAN VUIA",
    "V\\. VOICULESCU", "VASILE VOICULESCU",
    "L\\. BLAGA", "LUCIAN BLAGA",
    "M\\. KOGALNICEANU", "MIHAIL KOGALNICEANU",
    "N\\. BOLCAS", "NICOLAE BOLCAS",
    "N\\. JIGA", "NICOLAE JIGA",
    "P\\. COSMA", "PARTENIE COSMA",
    "D\\. LEONIDA", "DIMITRIE LEONIDA",
    "O\\. GHIBU", "ONISIFOR GHIBU",
    "A\\. ROMAN", "ALEXANDRU ROMAN",
    "GHEORGHE MURGOCI", "GHEORGHE MUNTEANU MURGOCI",
    "N\\. COMANECI", "NADIA COMANECI",
    "CU PROGRAM SPORTIV", "SPORTIV",
    "CU PROGRAM DE ATLETISM", "SPORTIV",
    '"DIMITRIE TICHINDEAL"', '"PREPARANDIA-DIMITRIE TICHINDEAL"',
    "N\\. VASILESCU", "NICOLAE VASILESCU",
    "IND\\.", "INDUSTRIAL",
    "N PLESOIANU", "NICOLAE PLESOIANU",
    "C\\. ANGELESCU", "CONSTANTIN ANGELESCU",
    "DOBRESCU-ARGES", "DOBRESCU",
    "SZENT ERZSEBET", "SFANTA ELISABETA",
    "SAT CIORANII DE JOS\\. COMUNA CIORANI", "CIORANI",
    "CIORANII DE JOS", "CIORANI",
    "MANECIU-UNGURENI", "MANECIU",
    "SAT GHEABA\\. COMUNA MANECIU", "MANECIU",
    "ION I\\. C\\. BRATIANU", "ION CONSTANTIN BRATIANU",
    "I\\. C\\. BRATIANU", "ION CONSTANTIN BRATIANU",
    "ION C\\. BRATIANU", "ION CONSTANTIN BRATIANU",
    "TEHNLOGIC", "TEHNOLOGIC",
    "SINTANA", "SANTANA",
    "BOLINTIN VALE", "BOLINTIN-VALE"
  )
  nm <- apply_rules(nm, name_rules)

  # ---- Drop the village ("SAT ...") part, keep the "COMUNA ..." part ----
  # Only names that actually contain "SAT " followed later by "COMUNA" are
  # rewritten. Without this guard a name containing "SAT " but no "COMUNA"
  # had its own full text spliced in after the prefix (i.e. was duplicated).
  has_village <- grepl("SAT .*COMUNA", nm, perl = TRUE)
  nm[has_village] <- vapply(
    nm[has_village],
    function(s) {
      # Everything from "COMUNA" onwards ...
      town_part <- gsub(".+?(?=COMUNA)", "", s, perl = TRUE)
      # ... replaces everything from "SAT " onwards.
      gsub("(SAT ).*", town_part, s, perl = TRUE)
    },
    character(1),
    USE.NAMES = FALSE
  )

  # ---- Year-specific patches ----
  if (isTRUE(year == 2014)) {
    nm <- gsub("GALAT$", "GALATI", nm)
    nm <- gsub("TARGU-MURES", "TARGU MURES", nm)
  }
  if (isTRUE(year == 2019)) {
    # which() skips rows with a missing county instead of erroring on them.
    in_mehedinti <- which(lookup$judet == "MEHEDINTI")
    nm[in_mehedinti] <- gsub(
      "COLEGIUL TEHNOLOGIC$",
      "COLEGIUL TEHNIC DE TRANSPORTURI AUTO",
      nm[in_mehedinti]
    )
  }

  # ---- Spacing around the quoted part of the name ----
  # Space before the first quotation mark.
  nm <- sub('(.*?)"', '\\1 "', nm)
  # Space after the closing quotation mark unless a comma follows.
  nm <- sub('(\\".*?)"(?!\\,)', '\\1" ', nm, perl = TRUE)
  nm <- trimws(gsub("\\s+", " ", nm))

  # ---- Attach the cleaned names to the full data ----
  lookup$unitate_de_invatamant_new <- nm
  merged <- base::merge(data_bac_raw, lookup, by = key_cols, all.x = TRUE)
  merged$unitate_de_invatamant_orig <- merged$unitate_de_invatamant
  merged$unitate_de_invatamant <- trimws(merged$unitate_de_invatamant_new)
  merged$unitate_de_invatamant_new <- NULL

  merged
}